##################################################################
##################################################################
## Replication Material
## Stefan Müller: The Temporal Focus of Campaign Communication
## The Journal of Politics
## stefan.mueller@ucd.ie
##
## Script 4: Results reported in SI Section A
##################################################################
##################################################################

# Note: The file description_replication_material_jop_mueller.pdf describes the purpose of this 
# file in detail and lists the names and sources of all datasets 
# used in this script

# This script was run on the following R version, platform and OS:
# R version 3.6.0 (2019-04-26)
# Platform: x86_64-apple-darwin15.6.0 (64-bit)
# Running under: macOS Catalina 10.15.5

# load packages required to run this script
library(dplyr)   # CRAN v1.0.0
library(tidyr)   # CRAN v1.1.0
library(ggplot2) # CRAN v3.3.2
library(scales)  # CRAN v1.1.1

# Custom ggplot2 theme shared by all figures in this script. It starts from
# theme_minimal(), removes every grid line, draws a thin solid black panel
# border, places the legend at the bottom and fixes the text sizes.
# Note: the `size` arguments of element_rect()/element_line() match the
# ggplot2 version this script was run with (3.3.2); ggplot2 >= 3.4.0
# deprecates them in favour of `linewidth`.
theme_baser <- function() {
  no_line <- element_blank()

  overrides <- theme(
    # panel: no grid lines, thin black frame
    panel.grid.major.x = no_line,
    panel.grid.major.y = no_line,
    panel.grid.minor.x = no_line,
    panel.grid.minor.y = no_line,
    panel.border = element_rect(
      fill = NA, color = "black", size = 0.5, linetype = "solid"
    ),
    # titles and facet strips
    plot.title = element_text(
      size = 15, face = "italic", vjust = 1.5, hjust = 0,
      margin = margin(t = 0, r = 0, b = 12, l = 0)
    ),
    strip.text = element_text(
      size = 15, hjust = 0.5,
      margin = margin(t = 5, r = 5, b = 5, l = 5)
    ),
    # axes
    axis.text = element_text(colour = "black", size = 13),
    axis.title = element_text(size = 13, hjust = 0.5),
    axis.ticks = element_line(size = 0.3),
    axis.ticks.length = unit(0.2, "cm"),
    # legend
    legend.position = "bottom",
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 13)
  )

  # %+replace% swaps whole theme elements instead of merging their properties
  theme_minimal() %+replace% overrides
}

# make the custom theme the default for all subsequent plots
theme_set(theme_baser())

# load the classified manifesto sentences
dat_combined_all <- readRDS("data_manifestos_classified.rds")

# keep only sentences with fewer than 100 tokens
# (i.e. remove all sentences with more than 99 tokens)
dat_combined <- dat_combined_all %>%
  filter(ntoken < 100)

# store the year as a number and the predicted class as a factor
# NOTE(review): as.numeric() returns the level codes if `year` is stored as a
# factor -- confirm that it is character or numeric in the .rds file
dat_combined[["year"]] <- as.numeric(dat_combined[["year"]])
dat_combined[["class"]] <- factor(dat_combined[["class"]])


# plot manifesto availability

# first and last year with a manifesto, per language and country
# (one row per manifesto before aggregating)
dat_sum_manifestos_range <- dat_combined %>% 
  select(language_capital, countryname, manifesto_id, annotations, year) %>%
  unique() %>% 
  group_by(language_capital, countryname) %>% 
  summarise(year_min = min(year, na.rm = TRUE),
            year_max = max(year, na.rm = TRUE),
            .groups = "drop_last") 

# number and within-country share of manifestos by sentence type
dat_sum_manifestos <- dat_combined %>% 
  select(language_capital, countryname, manifesto_id, annotations, year) %>%
  unique() %>% 
  group_by(language_capital, countryname, annotations) %>% 
  # "drop_last" keeps the language/country grouping, so sum(n) below is the
  # country total and relfreq is the share within each country
  summarise(n = n(), .groups = "drop_last") %>% 
  mutate(relfreq = n / sum(n)) %>% 
  mutate(annotated = ifelse(annotations == "TRUE", "Quasi-sentences",
                            "Natural sentences")) %>% 
  # join keys are spelled out so that the join cannot silently pick up
  # additional shared columns
  left_join(dat_sum_manifestos_range,
            by = c("language_capital", "countryname")) %>% 
  # facet label: country name with the covered year range on a second line
  mutate(countryname_years = paste0(countryname, "\n(", year_min, "-", year_max, ")"))


## Figure A02 ----
# Horizontal bars with the share of each sentence type (relfreq) per country
# facet; the white labels print the underlying counts (n).
ggplot(data = dat_sum_manifestos,
       mapping = aes(x = annotated, y = relfreq, fill = annotated)) +
  geom_col() +
  # shift the count labels slightly inwards so that they sit inside the bars
  geom_text(mapping = aes(label = n),
            colour = "white", size = 3, nudge_y = -0.05) +
  facet_wrap(~countryname_years) +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_fill_manual(values = c("grey50", "black")) +
  labs(x = NULL, y = "Percent") +
  theme(legend.position = "none")
# ggsave() writes the most recently created plot
ggsave(filename = "fga02.pdf", width = 10, height = 5)


# calculate the average sentence length (in tokens) per manifesto and
# predicted class
dat_sum_types <- dat_combined %>% 
  group_by(language_capital, manifesto_id, annotations, class) %>%
  # "drop_last" makes the default grouping of the result explicit
  summarise(mean_token = mean(ntoken), .groups = "drop_last") %>% 
  mutate(annotated = ifelse(annotations == "TRUE", "Quasi-sentences",
                            "Natural sentences")) 

# fix the order of the classes for plotting
# NOTE(review): any value other than these three labels becomes NA --
# presumably `class` only contains them; confirm against the data
dat_sum_types$class <- factor(dat_sum_types$class,
                              levels = c("Future", "Present", "Past"))


## Figure A03 ----
# Box plots of the average sentence length per manifesto by predicted class,
# overlaid with the individual manifesto-level values; facets split by
# sentence type (rows) and language (columns).
# Note: ggbeeswarm is called via `::` and is not attached at the top of this
# script, so it has to be installed.
ggplot(data = dat_sum_types, mapping = aes(x = class, y = mean_token)) +
  # outliers are drawn in white, presumably to hide them because every
  # observation is plotted by the quasirandom layer anyway
  geom_boxplot(colour = "red", outlier.colour = "white") +
  ggbeeswarm::geom_quasirandom(alpha = 0.1) +
  coord_flip() +
  facet_grid(annotated~language_capital) +
  labs(x = "Predicted class",
       y = "Average sentence length (on the level of manifestos)")
# ggsave() writes the most recently created plot
ggsave(filename = "fga03.pdf", width = 10, height = 4.5)
